Qiime2 uses a compressed file format called an ‘Artifact’ for its analyses. Artifacts have different semantic types, e.g. FeatureData[Sequence] or Phylogeny[Unrooted], depending on the type of data they contain. To begin the analysis, the fastq files were imported into SampleData[SequencesWithQuality] or SampleData[PairedEndSequencesWithQuality] artifacts.
# Relevant commands
# Import the paired-end reads listed in the manifest devon.tsv into the
# artifact devonFQ.qza; the chosen input format declares Phred64 quality scores.
qiime tools import \
--type 'SampleData[PairedEndSequencesWithQuality]' \
--input-path devon.tsv \
--output-path devonFQ.qza \
--input-format PairedEndFastqManifestPhred64V2
# Import the single-end reads listed in the manifest neem.tsv into the
# artifact neem.qza; the chosen input format declares Phred33 quality scores.
qiime tools import \
--type 'SampleData[SequencesWithQuality]' \
--input-path neem.tsv \
--output-path neem.qza \
--input-format SingleEndFastqManifestPhred33V2
The MicFunPred and NCBI databases were obtained in the standard BLAST format and needed to be converted into compatible data types for import into the qiime2 workflow. Specifically, I needed to convert them into FASTA format with an associated taxonomy mapping file (in HeaderlessTSVTaxonomyFormat).
# 1. Dump every entry of each BLAST database to FASTA.
blastdbcmd -db NCBI_16S/16S_ribosomal_RNA -entry all > NCBI_16S.fasta
blastdbcmd -db micfun/micfun16S -entry all > micfun.fasta
# 2. Build "ID<TAB>taxonomy" mapping files from the FASTA headers.
#    NCBI: the first space splits ID from description; remaining spaces become underscores.
grep '>' NCBI_16S.fasta | tr -d '>' | sed 's/ /\t/' | sed 's/ /_/g' > NCBI_16SID.txt
#    MicFunPred: the first underscore splits ID from description.
grep '>' micfun.fasta | tr -d '>' | sed 's/_/\t/' | sort -u > micfunID.txt # Unfortunately several of the headers repeat
# 3. Reduce each FASTA header to the bare ID so it matches the first column of
#    the mapping files from step 2 (only header lines are rewritten).
sed '/^>/ s/_.*//' micfun.fasta > micfunID.fasta
sed '/^>/ s/ .*//' NCBI_16S.fasta > ncbi16sID.fasta
# 4. Combine the three reference sets. The ID-trimmed MicFunPred FASTA is used
#    here; the untrimmed micfun.fasta headers would not match micfunID.txt.
cat micfunID.txt NCBI_16SID.txt EzBioCloud/ezbiocloud_id_taxonomy.txt > all_mappings.txt
cat EzBioCloud/ezbiocloud_qiime_full.fasta ncbi16sID.fasta micfunID.fasta > all.fasta
# Run the de-duplication script (it reads all.fasta and all_mappings.txt and
# writes all_uniq.fasta and all_uniqIDs.txt) BEFORE the imports below.
# The sequences are imported as FeatureData[Sequence]; only the mapping file is
# taxonomy. The types are quoted so the shell does not treat [...] as a glob.
qiime tools import --type 'FeatureData[Sequence]' --input-path all_uniq.fasta --output-path all_fasta.qza
qiime tools import --type 'FeatureData[Taxonomy]' --input-format HeaderlessTSVTaxonomyFormat --input-path all_uniqIDs.txt --output-path Ids.qza
# Script for removing redundant ids
import csv

from Bio import SeqIO

# Pass 1: keep only the first record seen for each sequence ID in all.fasta.
# The `with` block guarantees the output file is flushed and closed; the
# previous version referenced `mapped.close` without calling it.
seen_seq_ids: set = set()
with open('all_uniq.fasta', 'w') as mapped:
    for seq in SeqIO.parse('all.fasta', 'fasta'):
        if seq.id in seen_seq_ids:
            continue
        seen_seq_ids.add(seq.id)
        mapped.write(f'>{seq.id}\n')
        mapped.write(f'{seq.seq}\n')

# Pass 2: keep only the first "ID<TAB>taxonomy" row seen for each ID.
seen_map_ids: set = set()
with open('all_mappings.txt', 'r') as source, open('all_uniqIDs.txt', 'w') as uniq:
    for row in csv.reader(source, delimiter='\t'):
        if not row:
            # Blank lines (e.g. left over from concatenating files) carry no ID.
            continue
        if row[0] in seen_map_ids:
            continue
        seen_map_ids.add(row[0])
        uniq.write(f'{row[0]}\t{row[1]}\n')
# Load the per-site diversity artifacts for every metric in `alpha_metrics`.
alpha <- get_artifact_data(
  "./results/7-Diversity",
  id_key,
  extension = "",
  metric_list = alpha_metrics
)

# One column per site (named by its full label from `id_key`), filled from the
# second column of that site's `fa` table.
# NOTE: the frame is fixed at 10 rows, so each site's values must fit 10 rows.
faith <- data.frame(row.names = 1:10)
for (site in names(id_key)) {
  faith[[id_key[[site]]]] <- alpha[[site]]$fa[, 2]
}

# Reshape to long form (key = site, value = diversity) and draw one density
# ridge per site. pivot_longer() replaces the superseded gather().
faith_div_plot <- faith %>%
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  ggplot(aes(x = value, y = key, fill = key)) +
  ggridges::geom_density_ridges2() +
  guides(fill = "none") +
  labs(y = "Site", x = "Faith diversity")
faith_div_plot
## Picking joint bandwidth of 2.34
# One column per site (named by its full label from `id_key`), filled from the
# first column of that site's `pi` table.
# NOTE: the frame is fixed at 10 rows, so each site's values must fit 10 rows.
pi_evenness <- data.frame(row.names = 1:10)
for (site in names(id_key)) {
  pi_evenness[[id_key[[site]]]] <- alpha[[site]]$pi[, 1]
}

# Reshape to long form (key = site, value = evenness) and draw one box per
# site. pivot_longer() replaces the superseded gather().
even_plot <- pi_evenness %>%
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  ggplot(aes(x = key, y = value, fill = key)) +
  geom_boxplot() +
  guides(fill = "none") +
  labs(x = "Site", y = "Evenness", title = "Pielou evenness")
even_plot
It’s pretty clear that some sites have much lower evenness than others. Others, like the Barrow mountain sites, Bihor mountains and Catriona snow, seem similar from the box plot. Whether or not the difference in evenness between them is statistically significant can be tested with the Kruskal-Wallis test.
# Kruskal-Wallis rank sum test across the four sites that look alike in the
# box plot; each selected column is treated as one group.
pi_evenness %>%
  select(all_of(c(
    "Barrow mountain high", "Barrow mountain low",
    "Bihor mountains", "Catriona snow"
  ))) %>%
  kruskal.test()
##
## Kruskal-Wallis rank sum test
##
## data: .
## Kruskal-Wallis chi-squared = 3.8824, df = 3, p-value = 0.2744
Beta diversity quantifies the distance/dissimilarity between sites and is measured on a scale of 0 (identical) to 1 (completely different).
# Per-site OTU frequency tables, each coerced to a plain data frame.
otu_freqs <- get_artifact_data("./results/2-OTUs", id_key, "otuFreqs") %>%
  lapply(as.data.frame)

# Approximate maximum likelihood trees, quick and useful for testing data
fasttree <- get_artifact_data(
  "./results/6-RootedTrees", id_key, "FastTree_RootedTree"
)

# Real maximum likelihood trees, accurate, but slow
iqtree <- get_artifact_data(
  "./results/6-RootedTrees", id_key, "IQTREE_RootedTree"
)
# Annotate an example tree (site GrI).
# Align the rows of the OTU table with the order of the tree tips.
matched <- match(iqtree$GrI$tip.label, rownames(otu_freqs$GrI))

# Mean frequency per OTU across samples, relabelled OTU1, OTU2, ... in tip order.
freq_mapping <- otu_freqs$GrI[matched, ] %>%
  rowMeans() %>%
  as.data.frame() %>%
  `rownames<-`(paste0("OTU", seq_len(nrow(.))))

# Replace the raw tip labels with "<OTU label> freq = <mean frequency>".
iqtree$GrI$tip.label <- paste(rownames(freq_mapping), "freq =", freq_mapping[[1]])

# `color` is passed as a fixed parameter rather than inside aes(): wrapping a
# constant in aes() maps it to the default discrete palette, so the requested
# hex colour was never actually drawn.
sample_tree <- iqtree$GrI %>%
  ggtree(layout = "roundrect", color = "#A4E473") +
  geom_tiplab(size = 3, color = "#004651") +
  geom_tippoint(color = "#66CC8A") +
  labs(title = "Phylogenetic tree of Cryoconite samples") +
  theme(legend.position = "none", axis.text = element_text(size = 14))
sample_tree
Beta diversity calculations on multiple sites at once return a distance matrix, where the first row and column are the sites and the entries are the pairwise distances between them. This can be depicted using ordination methods.
# Distance matrices for the merged dataset, one per metric in `beta_metrics`.
beta <- get_artifact_data(
  "./results/7-Diversity", list(Merged = NULL),
  extension = "", metric_list = beta_metrics
)

# Matching 2D PCoA results (files prefixed "PCOA-2D_").
pcoa2D <- get_artifact_data(
  "./results/8-Analysis", list(Merged = NULL),
  extension = "PCOA-2D_", metric_list = beta_metrics
)

# Attach the sample metadata to every ordination.
pcoa2D_merged <- pcoa2D$Merged %>%
  lapply(metadata_merge_pcoa, metadata = metadata)
# One PCoA panel per distance metric, points coloured by Location.
# `pcoaja` is not part of the arrangement below; it is reused later in the
# taxonomy-vs-function comparison (`func_compare`).
pcoaja <- plot_pcoa(pcoa2D_merged$ja, "Location") +
  labs(x = "PC1", y = "PC2", title = "Jaccard distance")
pcoabc <- plot_pcoa(pcoa2D_merged$bc, "Location") +
  labs(x = "PC1", y = "PC2", title = "Bray Curtis")
# The y-axis title is blanked on the panels that sit to the right of the first.
pcoauu <- plot_pcoa(pcoa2D_merged$uu, "Location") +
  labs(x = "PC1", y = element_blank(), title = "Unweighted Unifrac")
pcoawn <- plot_pcoa(pcoa2D_merged$wn, "Location") +
  labs(x = "PC1", y = element_blank(), title = "Weighted normalized Unifrac")

# Three panels side by side sharing a single legend.
pcoa_arrange <- ggarrange(
  pcoabc, pcoauu, pcoawn,
  ncol = 3,
  common.legend = TRUE,
  legend = "right"
)
pcoa_arrange
- The PCoA plots clearly depict how the choice of distance metric affects the clustering of samples. - Clustering in the Bray Curtis plot represents sites sharing many of the same species in similar abundances. The big cluster falls apart once we factor in the evolutionary distance between OTUs, as shown by the Unifrac metrics. - This implies that although some taxa are shared, the unique taxa are evolutionarily distant (essentially have very different DNA) from the shared ones. - Once we add weightings by abundance, though, new clusters form, indicating there are many more common taxa within the groups than there are unique taxa. - Even partitioning sites by habitat type (glacier and permafrost) doesn’t help.
# Filter the sites
# Site codes retained for the follow-up hypothesis tests.
keep <- c("ViS", "BrL", "BrH", "SvG")
# Restrict the `wn` distance matrix of the merged dataset to the kept sites.
filter_wn <- filter_dm(beta$Merged$wn, keep)
# Restrict the sample metadata to the same sites.
# NOTE(review): presumably rows end up in the same order as filter_wn — verify in filter_meta.
filtered_meta <- filter_meta(metadata, keep)
Within the chosen cluster, it is visually unclear whether or not the differences between sites are significant. There are specific hypothesis tests for this problem, such as Permutational Multivariate Analysis of Variance (PERMANOVA) and Analysis of Similarities (ANOSIM).
TODO:what is the difference between them?
# PERMANOVA: model the filtered distances as a function of Location.
# NOTE(review): assumes filtered_meta rows are in the same order as filter_wn — confirm.
adonis2(filter_wn ~ filtered_meta$Location)
# ANOSIM on the same distances, grouped by Location.
anosim(filter_wn, grouping = filtered_meta$Location)
##
## Call:
## anosim(x = filter_wn, grouping = filtered_meta$Location)
## Dissimilarity:
##
## ANOSIM statistic R: 0.3687
## Significance: 0.001
##
## Permutation: free
## Number of permutations: 999
# Export for the FAPROTAX database
# Build one genus-level table per site from its OTU frequencies and its BLAST
# taxonomy. `blast` (the per-site BLAST classifications) must already be
# loaded before this chunk runs.
otu_genus <- list()
for (id in names(id_key)) {
  otu_genus[[id]] <- to_genus_csv(otu_freqs[[id]], blast[[id]])
}

# Combine the per-site tables and write the combined table. The previous
# version passed `combined`, which is never defined in this chunk, to
# write.csv() instead of the `genus_combined` object created here.
genus_combined <- combine_freqs(otu_genus, taxon)
write.csv(genus_combined, "genus_otu_tables.csv", row.names = FALSE)
The raw output tables here are the absolute abundances of inferred biological pathways in each site based on the MetaCyc database. PICRUSt2 works by first predicting important reactions for metabolism in the site (expressed as KEGG Orthology (KO) identifiers and Enzyme Commission (EC) numbers), then using their abundances for pathway inference. A pathway is essentially a set of reactions working together for a specific purpose, such as energy storage or synthesis. - Originally I had planned to use the FAPROTAX database, but the script the authors provided didn’t work. - I encountered another problem where PICRUSt2 failed to analyze the merged dataset, so I had to combine the output tables from each sample.
# Merge the PICRUSt2 tables: full outer join of every table on `pathway`.
ko_all <- reduce(ko, merge, by = "pathway", all = TRUE)
ko_all <- as_tibble(ko_all)
# Pathways absent from a table come out of the join as NA; count them as 0.
ko_all <- replace(ko_all, is.na(ko_all), 0)
# Convert to relative abundances
ko_all <- as_tibble(rel_abund(ko_all, pathway))
message(glue("There are {dim(ko_all)[1]} inferred pathways"))
## There are 438 inferred pathways
# Transpose into sites x function format
ko_xfunc <- sites_x_func(ko_all)

# Compute bray curtis distance, then plot pcoa
bc_func <- vegdist(ko_xfunc, method = "bray")
pcoa_bc_func <- metadata_merge_pcoa(
  metadata,
  wcmdscale(bc_func, k = 2),
  functions = TRUE
)
## Joining with `by = join_by(sample.id)`

# Compute jaccard distance
ja_func <- vegdist(ko_xfunc, method = "jaccard")
pcoa_ja_func <- metadata_merge_pcoa(
  metadata,
  wcmdscale(ja_func, k = 2),
  functions = TRUE
)
## Joining with `by = join_by(sample.id)`
# Plot and compare with ordination on taxonomy
plot_ja_func <- plot_pcoa(pcoa_ja_func, "Location", functions = TRUE) +
  labs(
    x = element_blank(), y = element_blank(), title = "Jaccard distance",
    subtitle = "From biological pathways"
  )
# This panel is built from the Bray-Curtis ordination (pcoa_bc_func); its
# title previously read "Jaccard distance", copied from the panel above.
plot_bc_func <- plot_pcoa(pcoa_bc_func, "Location", functions = TRUE) +
  labs(
    x = "PC1", y = element_blank(), title = "Bray Curtis",
    subtitle = "From biological pathways"
  )

# 2 x 2 grid: taxonomy-based panels on the left, pathway-based on the right;
# Jaccard on the top row, Bray-Curtis on the bottom row.
func_compare <- ggarrange(
  pcoaja + labs(x = element_blank()), plot_ja_func,
  pcoabc, plot_bc_func,
  ncol = 2, nrow = 2, common.legend = TRUE, legend = "bottom"
) + theme(axis.text = element_text(size = 14))
func_compare
For sample taxonomic classification, I will be trying out all three of the methods available in qiime2:
# Per-site classifications from each classifier, parsed into taxonomy tables.
blast <- get_artifact_data("./results/3-Classified", id_key, "BLAST_All") %>%
  lapply(parse_taxonomy)
sklearn <- get_artifact_data("./results/3-Classified", id_key, "Sklearn") %>%
  lapply(parse_taxonomy)

# Sklearn classification of the merged dataset.
sk_merged <- parse_taxonomy(
  read_qza("./results/3-Classified/Merged-Sklearn.qza")$data
)

# Report the identification rate of each classifier.
count_identified(sklearn, "Sklearn")
## Sklearn: 0.584407164275873
count_identified(blast, "BLAST")
## BLAST: 0.579451912055851
sk_merged
The sklearn classifier identifies a slightly higher proportion of OTUs, so it will be used for all downstream analyses.
Finally, I will perform tests on the abundance of identified OTUs and the predicted pathways in the previously selected sites. Differential abundance analysis refers to identifying which OTUs are more or less present between different samples. Traditional statistical tests are generally inappropriate for this application because the data is so sparse.
I will be using the Analysis of Microbiomes with Bias correction test (ANCOM-BC). To get an idea of what to expect from the test, I’ll produce a bar plot depicting the relative phylum-level abundances between the sites.
# OTU frequency table and sklearn taxonomy for the merged dataset.
all <- read_qza("./results/2-OTUs/Merged-otuFreqs.qza")$data
sk_merged <- parse_taxonomy(
  read_qza("./results/3-Classified/Merged-Sklearn.qza")$data
)

# Join frequencies to taxonomy at level 2, drop rows with no taxon, then sum
# every remaining column within each taxon.
ranks <- merge_with_id(all, sk_merged, level = 2) %>%
  filter(!is.na(taxon)) %>%
  group_by(taxon) %>%
  summarise(across(everything(), sum))
# Taxon names passed to sum_by_site() alongside the taxon table.
not_bacteria <- c(
  "Arthropoda", "Nanoarchaeota", "Diatomea", "Altiarchaeota",
  "Ascomycota", "Basidiomycota", "Cercozoa", "Ciliophora", "Asgardarchaeota",
  "Phragmoplastophyta", "Euryarchaeota", "Crenarchaeota"
)

# Pathway identifiers to display; each entry is used as a regex alternative.
shown_paths <- c(
  "CHLOROPHYLL-SYN", "GLYCOLYSIS", "TCA", "CALVIN-PWY",
  "PENTOSE-P-PWY", "METHANOGENESIS-PWY", "DENITRIFICATION-PWY", "FERMENTATION-PWY",
  "LACTOSECAT-PWY", "METH-ACETATE-PWY"
)

# Keep every pathway whose name matches any of the identifiers above.
nice_paths <- filter(
  ko_all,
  grepl(paste(shown_paths, collapse = "|"), pathway)
)

tax_sum <- sum_by_site(ranks, id_key, "taxon", not_bacteria)
path_sum <- sum_by_site(nice_paths, id_key, "pathway", NaN)
# Stacked bars: one bar per site, segments filled by phylum.
# NOTE(review): only `fill` is mapped here, so the colour scale below appears
# to have no visible effect — confirm whether a fill palette was intended.
stacked <- ggplot(tax_sum, aes(x = name, y = value, fill = identifier)) +
  geom_col() +
  scale_fill_discrete(name = "Phylum") +
  scale_color_paletteer_d("pals::glasbey") +
  labs(
    x = "Site", y = "Relative abundance", title = "Phyla relative abundance",
    subtitle = "*Putative phyla and false positives
(non-prokaryotes) removed"
  ) +
  theme(axis.text = element_text(size = 14))
stacked
# Heatmap: sites on x, selected pathways on y, tile fill = value.
heat_path <- ggplot(path_sum, aes(x = name, y = identifier, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(
    name = "Relative abundance",
    low = "springgreen", mid = "seagreen1", high = "seagreen"
  ) +
  labs(x = "Site", y = "Pathway")
heat_path
Going to the second-most inclusive taxonomic rank was essential here, given that there are 10668 OTUs total. Still, I noticed there were a few identifications that weren’t prokaryotic. These are most likely false positives, given the specificity of the 16s rRNA primers used to sequence the samples.
From the plot, the TODO:Which look abundant?
There wouldn’t be much point to testing differential abundance for sites that we already know to be similar based on the weighted unifrac plots, so I stuck with the cluster of sites that ANOSIM and PERMANOVA were conducted on
# First we prepare the TreeSummarizedExperiment object
tax_info <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
false_positives <- c("Unassigned", "Arthropoda", "Bacteria", "Insecta") # False positives

# Merged OTU counts as a plain data frame.
otus <- as.data.frame(read_qza("./results/2-OTUs/Merged-otuFreqs.qza")$data)
# Pathway table with the pathway names moved into the row names.
paths <- column_to_rownames(ko_all, var = "pathway")

# Format the metadata
# Reorder the metadata rows to follow the sample columns of `paths`, then
# move sample.id into the row names.
formatted_meta_paths <- metadata[match(colnames(paths), metadata$sample.id), ] %>%
  `rownames<-`(NULL) %>%
  column_to_rownames(var = "sample.id") %>%
  sample_data()

# Pathway-level phyloseq object: abundance table plus sample data only.
phylo_path <- phyloseq(
  otu_table(paths, taxa_are_rows = TRUE),
  formatted_meta_paths
)

# Taxonomy rows reordered to follow the OTU table, with rank names as columns.
matched_tax <- sk_merged[match(rownames(otus), rownames(sk_merged)), ] %>%
  tax_table() %>%
  `colnames<-`(tax_info)
## Warning in .local(object): Coercing from data.frame class to character matrix
## prior to building taxonomyTable.
## This could introduce artifacts.
## Check your taxonomyTable, or coerce to matrix manually.
# All of this is necessary because the TSE object won't be
# created properly unless the indices of the rownames match precisely
# Reorder the metadata rows to follow the sample columns of `otus`, then move
# sample.id into the row names.
formatted_meta <- metadata[match(colnames(otus), metadata$sample.id), ] %>%
  `rownames<-`(NULL) %>%
  column_to_rownames(var = "sample.id") %>%
  sample_data()

# Give the OTU table the same row names as the matched taxonomy table.
rownames(otus) <- rownames(matched_tax)

my_phylo <- phyloseq(
  otu_table(otus, taxa_are_rows = TRUE),
  formatted_meta,
  matched_tax
)
tse <- mia::makeTreeSummarizedExperimentFromPhyloseq(my_phylo)
# Now run the test, testing differential abundance between the groups of the
# metadata column named in `var` ("Type").
# Second TreeSummarizedExperiment, built from the pathway-level phyloseq object.
tse_paths <- mia::makeTreeSummarizedExperimentFromPhyloseq(phylo_path)
# Metadata column used both as the fixed-effect formula and as the grouping variable.
var <- "Type"
# Taxonomic rank handed to ancombc2 as `tax_level` for the taxon-level run.
chosen_rank <- "Class"
# Differentially abundant taxa between cryosphere types
abc <- ancombc2(
data = tse, assay_name = "counts", tax_level = chosen_rank,
fix_formula = var, group = var,
pairwise = TRUE
)
# Log-fold changes taken from the "res_pair" element of the result.
lfc <- prepare_abc_lfc(abc, "Type", "res_pair", chosen_rank, false_positives)
# write.csv2 writes semicolon-separated files with a comma decimal mark; the
# files are read back later with read.csv2.
write.csv2(abc$res_pair, "./results/8-ANCOM-BC/all_taxon_res.csv", row.names = FALSE)
write.csv2(lfc, "./results/8-ANCOM-BC/taxon_lfc.csv", row.names = FALSE)
# Same test on the pathway table.
# NOTE(review): phylo_path is built without a taxonomy table, yet
# tax_level = "Species" is requested here — confirm ancombc2 handles this as intended.
abc_paths <- ancombc2(
data = tse_paths, assay_name = "counts", tax_level = "Species",
fix_formula = "Type", pairwise = TRUE, group = "Type"
)
# NOTE(review): this reads "res" while the taxon run above reads "res_pair",
# and the export below writes abc_paths$res_pair — confirm which is intended.
path_lfc <- prepare_abc_lfc(abc_paths, "Type", "res", NA, NA)
write.csv2(abc_paths$res_pair, "./results/8-ANCOM-BC/all_path_res.csv", row.names = FALSE)
write.csv2(path_lfc, "./results/8-ANCOM-BC/path_lfc.csv", row.names = FALSE)
path_lfc
# Reload the saved taxon-level ANCOM-BC results.
abc_taxon <- read.csv2("./results/8-ANCOM-BC/all_taxon_res.csv")

# Dimensions of the table of distinct taxa returned by ancombc_select();
# element 1 is the number of distinct taxa.
taxon_all_counts <- abc_taxon %>%
  ancombc_select(glue("diff_{var}"), chosen_rank, false_positives) %>%
  select(taxon) %>%
  unique() %>%
  dim()

lfc <- read.csv2("./results/8-ANCOM-BC/taxon_lfc.csv")

# Distinct taxa in the log-fold-change table as a share of all distinct taxa,
# rounded to two decimals and expressed as a percentage.
percent_abund <- round(
  length(unique(lfc$taxon)) / taxon_all_counts[1],
  digits = 2
) * 100
print(glue("Percent of differentially abundant classes: {percent_abund}%"))
## Percent of differentially abundant classes: 64%

lfc <- quartile_filter(lfc)
# We keep the taxa with the highest and lowest log-fold changes between the types
abc_plot <- abc_lfc_plot(lfc) + scale_fill_discrete(name = "Class") + labs(x = "Type")

# Taxon carrying the largest log-fold change.
most_abund <- select(filter(lfc, lfc == max(lfc)), taxon)
message(glue("The most abundant class is {most_abund}"))
## The most abundant class is KD4-96
abc_taxon
abc_plot
KD4-96 is an uncharacterized class of the phylum Chloroflexi.
# Reload the saved pathway-level ANCOM-BC results.
all_path_lfc <- read.csv2("./results/8-ANCOM-BC/all_path_res.csv")
path_lfc <- read.csv2("./results/8-ANCOM-BC/path_lfc.csv")

# Dimensions of the table of distinct pathways returned by ancombc_select();
# element 1 is the number of distinct pathways.
path_all_counts <- all_path_lfc %>%
  ancombc_select(glue("diff_{var}"), NA, NA) %>%
  select(taxon) %>%
  unique() %>%
  dim()

# Distinct pathways in the log-fold-change table as a share of all distinct
# pathways, rounded to two decimals and expressed as a percentage.
percent_abund <- round(
  length(unique(path_lfc$taxon)) / path_all_counts[1],
  digits = 2
) * 100
path_all_counts
## [1] 420 1
print(glue("Percent of differentially abundant pathways: {percent_abund}"))
## Percent of differentially abundant pathways: 30

path_lfc <- quartile_filter(path_lfc)
abc_pathways <- abc_lfc_plot(path_lfc) + scale_fill_discrete(name = "Pathway")

# Pathway carrying the largest log-fold change.
most_abund <- select(filter(path_lfc, lfc == max(lfc)), taxon)
message(glue("The most differentially abundant pathway is {most_abund}"))
## The most differentially abundant pathway is PWY0-1338
all_path_lfc
abc_pathways
Is species diversity a good predictor of the pathway diversity in a site?
Hypothesis: the higher the diversity and evenness in a given site, the higher the functional diversity.
If they
# Transpose `faith` so each site becomes a row, move the site labels into a
# `site` column, then reshape every other column to long form. The result is
# not assigned, so it is printed.
pivot_longer(
  rownames_to_column(as.data.frame(t(faith)), var = "site"),
  -site
)
# shannon
# rownames(shannon)
# s <- shannon$`Bihor mountains`
# j <- ko$BhM %>% ko_to_div()
# j
#
# iqtree$CrC$edge.length %>% sum()
#
# cas <- iqtree$CaS
# crc <- iqtree$CrC
# TreeDistance(cas, crc)
#
# ko %>% lapply(ko_to_div)
# %>% as.numeric
# print() %>% lapply(diversity, index = "shannon")